*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*	This program identifies screened NYC middle schools.
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* Positional argument: bw is a filename infix. It is spliced into
* "program_pscore`bw'_ms_<year>.dta" when the general risk pscores are loaded.
args bw
*	----------------------------------------------------------------------------

* Each application year is processed independently; one screened-DBN file is
* saved per year at the bottom of the loop.
foreach year in 2016 2017 2018  {

	* MDRD1: pscores computed without bandwidth risk.
	* Reserve the temporary file up front, then build its contents.
	tempfile nobw
	use "${cleandata}program_pscore_nobw_ms_`year'.dta", clear

	* Copy of the program code under the name later used as a merge key
	generate programcode_suffix = programcode

	* Keep identifiers and the two scores; tag the scores as "no bandwidth"
	keep stu programcode* pscore_form pscore_freq
	rename (pscore_form pscore_freq) (pscore_form_no_bw pscore_freq_no_bw)
	save "`nobw'"

	* Load general risk pscores (MDRD2)
	use "${cleandata}program_pscore`bw'_ms_`year'.dta", clear

	* Keep the original program code as the key for the MDRD1 merge below,
	* then overwrite programcode with its stripped version
	gen programcode_suffix = programcode
	replace programcode = stripped_programcode

	* Check if we assigned same program as in the match
	gen same_prog = mrmatch  == stripped_match

	* Generate indicator for missing lotto
	gen missing_lotto = lottery_rank   == .

	* Build a programcode -> dbn lookup from the distinct
	* (finaloffer, finalofferdbn) pairs and save it to disk. The file is
	* permanent (not a tempfile) because it is read again further down when
	* the listed choices are assembled.
	preserve
		keep finaloffer finalofferdbn
		duplicates drop
		ren finaloffer programcode
		ren finalofferdbn dbn // TODO: check if this is supposed to be sch
		sa "${cleandata}/app_school_dbns_ms_`year'.dta", replace
	restore

	* Attach the DBN of each applied-to program; dbn_merge records which
	* applications found a DBN (3) and which did not (1).
	* m:1 requires programcode to be unique in the lookup, i.e. each
	* finaloffer code must map to exactly one finalofferdbn.
	merge m:1 programcode using "${cleandata}/app_school_dbns_ms_`year'.dta", keep(1 3)  gen(dbn_merge)

	ren dbn app_school_dbn

	* Merge MDRD1 pscore
	* NOTE(review): no keep() is given, so rows present only in the MDRD1
	* file are retained as well -- confirm this is intended.
	merge 1:1 stu programcode_suffix using "`nobw'", nogen
	drop programcode_suffix

	* ----------------------------------------------------------------------------
	* Collapsing ed opt buckets to the same program
	* ----------------------------------------------------------------------------

	* Simplified school ID: 4 characters of the DBN starting at position 3
	generate app_dbn_last_4 = substr(app_school_dbn, 3, 4)

	* 0/1 flag: row has screened_uses_lottery equal to 1
	generate screened_with_lottery = screened_uses_lottery == 1

	* Offer indicator at the student-program level (max over the program's rows)
	bysort stu programcode: egen offer_prg = max(offer)

	* Bring in our simulated match: non-sped file first, then the sped file
	* with -update- so it only fills values that are still missing.
	* NOTE(review): ourmatch is not in the keep list below, so it does not
	* survive this section -- confirm whether sim_offer_prg should use it.
	merge m:1 stu using "${cleandata}best_guess_simulated_match_ms_`year'_nonsped.dta", nogen keep(1 3) keepusing(ourmatch)
	merge m:1 stu using "${cleandata}best_guess_simulated_match_ms_`year'_sped.dta", nogen keep(1 3 4 5) keepusing(ourmatch) update

	* Offer variable for our match
	generate sim_offer_prg = my_offer

	* Aggregate pscores across rows of the same student-program
	* (program-level pscore = sum of the row-level pscores)
	local bucket_scores pscore_form pscore_form_bw pscore_form_no_bw ///
		pscore_freq pscore_freq_no_bw pscore_qbw
	foreach s of local bucket_scores {
		bysort stu programcode: egen double `s'_prog = total(`s')
	}

	* has_bw flag at the student-program level
	bysort stu programcode: egen has_bw_prg = max(has_bw)

	* Restrict to the program-level variables
	keep stu programcode choice desc description same ///
		app_school_dbn app_dbn_last_4 ///
		offer_prg sim_offer_prg missing_lotto ///
		screened_with_lottery has_bw_prg ///
		pscore_form_prog pscore_*_prog

	tempvar first_choice any_lottery

	* Revert choice to the smallest value within each student-program
	* (original choice for DIA programs)
	bysort stu programcode: egen `first_choice' = min(choice)
	replace choice = `first_choice'

	* A program counts as screened-with-lottery if any of its rows is flagged
	bysort programcode: egen `any_lottery' = max(screened_with_lottery)
	replace screened_with_lottery = `any_lottery'

	drop `first_choice' `any_lottery'

	* Rows are now identical within student-program: collapse and verify
	duplicates drop
	isid stu programcode

	tempfile original
	save "`original'"

	* ----------------------------------------------------------------------------
	* II Creating match-based dummies for instruments and pscores
	* ----------------------------------------------------------------------------

		* 	1) Build a student-level file of LISTED choices (ineligible
			* applications are kept here).
			* Mainly needed for the endogenous variable: an applicant may
			* enroll at a 1st/2nd/3rd choice for which they were ineligible,
			* so no offer exists in the match even though they attend.

			tempfile choicedbn full_choices

			use "${cleandata}nyc_match_reshape`year'_ms.dta", clear
			destring stu, replace

			* Attach each program's DBN from the lookup saved earlier
			merge m:1 programcode using "${cleandata}/app_school_dbns_ms_`year'.dta", keep(1 3)

			generate app_dbn_last_4 = substr(dbn, 3, 4)

			keep stu choice app_dbn_last_4 programcode

			* One pass per distinct rank appearing in the data
			levelsof choice, local(lchoices)
			foreach rank of local lchoices {

				* Indicator for the LISTED rank. This can differ from the
				* rank in the match file, where ineligible apps were dropped.
				generate listed_`rank' = choice == `rank'

				* Student-level file holding the school ID and program
				* listed at this rank
				preserve
					keep if choice == `rank'
					keep stu app_dbn_last_4  programcode
					rename (app_dbn_last_4 programcode) (listed_`rank'_last_4 prog_`rank')
					duplicates drop
					save "`choicedbn'", replace
				restore

				* Spread that information onto every row of the student
				merge m:1 stu using "`choicedbn'", nogen
			}

			* Reduce to the listed school IDs and the first three programs
			keep stu listed_*_last_4 prog_1 prog_2 prog_3
			duplicates drop
			save "`full_choices'"

			* Return to the original sample and attach the listed choices,
			* keeping only applications that are in the original sample
			use "`original'", clear

			merge m:1 stu using "`full_choices'", gen(choice_merge) keep( 1 3)

		* 	2) Generate indicator for top 3 choices based on realized match
			* (after dropping ineligible applications)
			gen list_3_choice  = ( choice <= 3 )

		* 	3) Choice 1 / 2-12 , 1-3 / 4-12
			gen list_1 = ( inlist(choice,1) )
			gen list_2_12 = ( inrange(choice,2,12) )

			gen list_1_3 = ( inrange(choice,1,3) )
			gen list_4_12 = ( inrange(choice,4,12) )

		* 	4) Top 3 choices based on actual ranking (keeping ineligible)
			* Require a non-empty school ID: otherwise an application with no
			* DBN would "match" any listed choice that is itself empty
			* (e.g. fewer than three choices listed, or no choice file match).
			gen top_3_school = app_dbn_last_4 != "" & ///
				inlist(app_dbn_last_4, listed_1_last_4, listed_2_last_4, listed_3_last_4 )

		* 	5) Identify programs that screen applicants
			gen screened_prog = inlist(description,"Screened", "Test Outcome", "Screened For Language","False Test Outcome")
			gen unscreened_prog = inlist(description , "Unscreened" ,"Zoned" , "Limited Unscreened","Charter")

			* Identify strictly screened programs: labeled as screened program
			* and do not use lottery
			gen strict_scr = screened_prog == 1 &   screened_with_lottery == 0

			* Identify unscreened or screened programs that do use lottery
			gen strict_unscr = unscreened_prog == 1 |  screened_with_lottery == 1

			* Share of applications per school that are for screened seats.
			* screened_prog is never missing, so the group mean is defined
			* and constant on every row of the school; no fill-down is needed.
			bysort app_school_dbn: egen mean_scr = mean(screened_prog )

			* Share of applications per school that are for unscreened seats
			bysort app_school_dbn: egen mean_unscr = mean(unscreened_prog)

			* Share of applications per school that are for strictly
			* screened / strictly unscreened seats
			bys app_dbn_last_4: egen mean_strict_scr =  mean(strict_scr )
			bys app_dbn_last_4: egen mean_strict_unscr =  mean(strict_unscr )

			* Check that the screened and unscreened shares add up to 1 for
			* every row. The shares are stored as floats, so allow for
			* rounding in either direction.
			assert abs(mean_scr + mean_unscr - 1) < 0.001

			/* Define several versions of screened/unscreened/edopts

			1) Edopt with screen share is screened
					-> Edopts are all screened */

			gen scr1 = mean_scr > 0
			gen unscr1 = mean_unscr == 1

			/*
			2) Edopts are a third category, where all programs at that school are edopts
					When unscreened and edopts are mixed, call it edopts
					When screened and edopts are mixed, call it screened
			   NOTE: as coded, scr2/unscr2 are computed exactly like scr1/unscr1. */

			gen scr2 = mean_scr > 0
			gen unscr2 = mean_unscr == 1

			/*
			3) Screened schools are only screened if all programs are screened ,
			and we add in a mixed sector that includes edopts */

			gen scr3 = mean_strict_scr == 1 & has_bw_prg == 1
			gen unscr3 = mean_unscr == 1 | !inlist(1, mean_strict_scr, mean_unscr) | has_bw_prg != 1

			* Every row must fall in exactly one of the two categories
			assert scr3 + unscr3 == 1

			/*4) Similar to 3 but break out edopts*/

			gen scr4 	= mean_strict_scr == 1
			gen unscr4 	= mean_unscr == 1 | !inlist(1, mean_strict_scr, mean_unscr)

			* Every row must fall in exactly one of the two categories
			assert scr4 + unscr4 == 1

	* ----------------------------------------------------------------------------
	* III Create and merge our own dummies for instruments and pscores
	* ----------------------------------------------------------------------------

			* Lift scr3 to the school level: a school is scr3 if any of its
			* rows is; unscr3 is defined as the complement.
			bysort app_dbn_last_4: egen scr3_temp = max(scr3)
			assert inlist(scr3_temp, 0, 1)
			generate unscr3_temp = 1 - scr3_temp

			* Sanity check: the two school-level flags sum to 1 on every row
			* (true by construction of unscr3_temp)
			egen test = rowtotal(scr3_temp unscr3_temp)
			summarize test
			assert `r(min)' == 1  & `r(max)' == 1

			* Swap the row-level scr3/unscr3 for the school-level versions
			drop test scr3 unscr3
			rename scr3_temp scr3
			rename unscr3_temp unscr3

		*	8) Adding charter flag
			* NOTE(review): charter is not in the keep list below, so it is
			* not written to the output file -- confirm whether it should be.

			generate charter = description == "Charter"

		* Save screened DBNs: one row per school ID with all four definitions

			keep app_dbn_last_4 ///
				scr1 scr2 scr3 scr4 ///
				unscr1 unscr2 unscr3 unscr4

			duplicates drop
			isid app_dbn_last_4, missok
			save "${builddata}screened_dbns_ms_`year'.dta", replace
	
}
